# of distinct accounts appearing in tweets over
total # of distinct friends that each user has. Now, it’s
# of distinct accounts appearing in tweets over
total # of distinct accounts that appeared in tweets. That
is, the denominator is now the total number of distinct accounts that
each user “sees” (not “has”). The x-axis is the # of tweets collected in all plots. However, the problem is
that this x-axis is not comparable across users since they have
different numbers of friends — e.g., some users follow many friends and
thus naturally receive more tweets, while others follow only a few
accounts and thereby receive fewer tweets. Since the y-axis is
in fraction/relative terms, the x-axis is also rescaled.
The x-axis (# of tweets collected) is rescaled by dividing it
by the average tweets per second of each user.

Load packages
library(readr)
library(tidyverse)
library(ggplot2)
library(ggthemes)
library(grid)
library(gridExtra)
library(DT)
library(lubridate)
Prepare data
# Load the raw data and normalize column types: the three identifier
# columns become factors, and account_id is carried forward as friend_id.
df <- read_csv("df.csv")
df <- df %>%
  mutate(
    across(c(user_id, tweet_id), as.factor),
    friend_id = as.factor(account_id)
  ) %>%
  dplyr::select(-account_id)
# what is the maximum of user_friends_count?
# One row per user holding the largest friends count observed for that user.
# (A single summarise() replaces the original select/distinct/mutate/
# select/distinct chain — same content, simpler and cheaper.)
df %>%
  group_by(user_id) %>%
  summarise(max_friends_count = max(user_friends_count)) -> max_data
# max_data: user_id - max_friends_count
# merge this 'max_data' into df (base merge: row order of the result does not
# matter here because downstream code re-arranges by timestamp)
df %>%
  merge(max_data, by = "user_id") -> df
# clean timestamp data to have affinity with R lubridate package
# Twitter-style timestamps split on spaces into fields:
#   [1] weekday, [2] month, [3] day, [4] time, [5] UTC offset, [6] year.
# Rebuild each as "day month year time" and parse with lubridate::dmy_hms().
# (Vectorized rewrite: the original grew Month/Day/Time/Year vectors inside
# two loops and built an intermediate data frame that was never used.)
list_timestamp <- str_split(df$tweet_timestamp, " ")
# length(list_timestamp) # 756404 = nrow(df)
dmyt <- vapply(
  list_timestamp,
  function(parts) paste(parts[3], parts[2], parts[6], parts[4]),
  character(1)
)
df$tweet_timestamp <- dmy_hms(dmyt)
# define x-axis: number of tweets collected
# df has one row per account appearing in a tweet, so count(tweet_id) gives
# the number of account-appearance rows per tweet; cumsum(n) accumulates
# those counts across tweets within each user.
# NOTE(review): despite its name, n_tweets is the running total of *rows*
# (account appearances), not of tweets — confirm this is intended.
# NOTE(review): count() orders its output by the grouping keys (here the
# tweet_id factor levels), so the preceding arrange(tweet_timestamp) may not
# survive into the cumsum() — verify that n_tweets accumulates in time
# order rather than tweet_id order.
df %>%
arrange(tweet_timestamp) %>%
group_by(user_id) %>%
count(tweet_id) %>%
mutate(
n_tweets = cumsum(n),
max_n_tweets = max(n_tweets)
) %>%
dplyr::select(
user_id, tweet_id, n_tweets, max_n_tweets
) -> df_for_x
# attach the running counts back onto the row-level data, time-ordered per user
df %>%
inner_join(df_for_x, by=c("user_id", "tweet_id")) %>%
arrange(user_id, tweet_timestamp) -> df
#* [X] Re-scaling → divide x axis by the average tweets per second of each participant.
#* For each participant, (1) take the first and last tweet in the data and compute the number of seconds between them, and then (2) divide the total number of tweets seen for the participant by the number of seconds.
# Per-user span of the collection window, in seconds.
# FIX: force seconds explicitly — `max(t) - min(t)` yields a difftime whose
# units are chosen automatically (secs/mins/hours/days depending on the
# magnitude), so a bare as.numeric() was not reliably "per second".
df |>
  group_by(user_id) |>
  summarise(
    timediff = as.numeric(
      difftime(max(tweet_timestamp), min(tweet_timestamp), units = "secs")
    )
  ) -> timeDiff
df |>
  merge(timeDiff, by = "user_id") |>
  group_by(user_id) |>
  mutate(
    # average tweets per second over the user's whole collection window
    avg_n_tweets_persec = max_n_tweets / as.numeric(timediff)
  ) |>
  ungroup() |>
  mutate(
    # rescaled x-axis: cumulative count divided by the user's tweet rate
    x = n_tweets / avg_n_tweets_persec
  ) -> df2
# define y-axis: running count of distinct accounts appearing in tweets.
#* [Y] y is a fraction: the cumulative number of distinct accounts seen so
#* far, divided by the per-user maximum (the total distinct accounts each
#* user "sees", not "has"), so every one of the 60 users ends at 1.
df2 <- df2 %>%
  arrange(user_id, tweet_timestamp) %>%
  group_by(user_id) %>%
  mutate(numerator = cumsum(!duplicated(friend_id))) %>%
  mutate(y = numerator / max(numerator))
# df2 is the final data for drawing plots
# n_tweets: original x variable (=number of tweets collected so far)
# x: new x variable (=n_tweets/avg_n_tweets_persec)
# y: fraction of distinct accounts appeared in tweets
# plot 1.
# Plot 1a: fraction of distinct accounts seen (y) vs. raw cumulative count
# (n_tweets), one colored trace per user.
# FIX: dropped the no-op group_by(user_id) — ggplot2 ignores dplyr grouping
# metadata; the per-user split comes entirely from aes(col = user_id).
df2 %>%
  ggplot(aes(x=n_tweets, y=y, col=user_id)) +
  geom_point(alpha=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("# of Tweets Collected") +
  ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
  ggtitle("Plot 1a")
# Plot 1b: same as 1a but with the rescaled x (cumulative count divided by
# the user's average tweets per second). One colored trace per user.
df2 %>%
  group_by(user_id) %>%
  ggplot(aes(x = x, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
  labs(
    title = "Plot 1b: x-axis re-scaled",
    x = "# of Tweets Collected / Avg # of Tweets per sec",
    y = "Fraction of Distinct Accounts Appearing in Tweets (%)"
  ) +
  theme_few() +
  theme(legend.position = "none")
# Plot 1c: same as 1b but on a log10 x-axis, spreading out the fast-rising
# left side of each curve.
# NOTE: group_by() before ggplot() has no effect on the plot; the per-user
# split comes from aes(col = user_id).
df2 %>%
group_by(user_id) %>%
ggplot(aes(x=x, y=y, col=user_id)) +
geom_point(alpha=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("Log(# of Tweets Collected / Avg # of Tweets per sec)") +
ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_log10(n.breaks=15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 1c: x-axis re-scaled + logged")
In the data frame (df2), there are 60 unique users. Let’s redraw plot 1a by user. I separated the sample into smaller chunks, and I allowed the x-axis scales to vary for each user.
# Plot 2a: per-user facets for the first 30 users (ordered by the factor
# level index of user_id), with free x scales so each user's own tweet
# volume is visible.
# NOTE(review): nrow = 10 with ncol = 6 allows up to 60 panels but only 30
# are drawn here — presumably intentional, verify.
df2 %>%
mutate(numeric_user_id = as.integer(user_id)) %>%
filter(numeric_user_id < 31) %>% # from 1~30
ggplot(aes(x=n_tweets, y=y, col=user_id)) +
geom_point(alpha=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected") +
ylab("Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 5) +
facet_wrap(~user_id, nrow=10, ncol=6, scales="free_x") +
ggtitle("Plot 2a: First 30 users")
# Plot 2b: per-user facets for the second half of the sample (users 31-60
# by factor-level index), with free x scales.
df2 %>%
  mutate(uid_rank = as.integer(user_id)) %>%
  filter(uid_rank > 30) %>% # users 31~60
  ggplot(aes(x = n_tweets, y = y, col = user_id)) +
  geom_point(alpha = 0.5) +
  facet_wrap(~user_id, nrow = 10, ncol = 6, scales = "free_x") +
  scale_x_continuous(n.breaks = 5) +
  labs(
    title = "Plot 2b: Second 30 users",
    x = "# of Tweets Collected",
    y = "Fraction of Distinct Accounts Appearing in Tweets (%)"
  ) +
  theme_few() +
  theme(legend.position = "none")
# aggregate plot: mean of y-axis by each point of x
# Plot 3a: collapse across users — the mean fraction y at every observed
# value of n_tweets — with a smoothing curve over the aggregated points.
df2 |>
  group_by(n_tweets) |>
  summarize(y = mean(y)) |>
  ungroup() |>
  ggplot(aes(x = n_tweets, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
  labs(
    title = "Plot 3a",
    x = "# of Tweets Collected",
    y = "Mean Fraction of Distinct Accounts Appearing in Tweets (%)"
  ) +
  theme_few() +
  theme(legend.position = "none")
# Plot 3b: same aggregate as 3a (mean y at each n_tweets value) drawn on a
# log10 x-axis.
df2 %>%
group_by(n_tweets) %>%
summarize(y = mean(y)) %>%
ungroup() %>%
ggplot(aes(x=n_tweets, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(color='darkcyan', linewidth=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("Log(# of Tweets Collected)") +
ylab("Mean Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_log10(n.breaks=15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 3b: x-axis logged")
## with rescaled x-axis?
# Plot 3c: aggregate (mean y) over the rescaled x. Because x is nearly
# unique per user, most "groups" contain a single row — see the discussion
# below the plots.
df2 %>%
group_by(x) %>%
summarize(y = mean(y)) %>%
ungroup() %>%
ggplot(aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(color='darkcyan', linewidth=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected / Avg # of Tweets per sec") +
ylab("Mean Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 3c: x-axis rescaled")
# Plot 3d: same aggregate as 3c (mean y at each rescaled x value) drawn on
# a log10 x-axis.
df2 |>
  group_by(x) |>
  summarize(y = mean(y)) |>
  ungroup() |>
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  scale_x_log10(n.breaks=15, label = scales::label_number(accuracy = 1)) +
  labs(
    title = "Plot 3d: x-axis rescaled & logged",
    x = "Log(# of Tweets Collected / Avg # of Tweets per sec)",
    y = "Mean Fraction of Distinct Accounts Appearing in Tweets (%)"
  ) +
  theme_few() +
  theme(legend.position = "none")
For the rescaled x-axis, the plots are not very informative.
They are basically the same as plots 1b and 1c — this is probably
because the avg # of tweets per sec differs greatly across users, so
dividing the # of tweets collected by it makes the x-axis values
essentially unique to each user. Thus, grouping by each point on the
x-axis and summarizing the mean value of y is almost useless.
With this caveat in mind, let’s zoom in the left part of the plot to see our focus of interest (the elbow point). I cut the rescaled x-axis by the median and show the lower part (x < median).
# Plot 3e: zoom into the lower half of the rescaled x-axis.
# Order matters here: y is averaged per x value first, then rows with x at
# or above the median of the *summarized* x values are dropped.
df2 %>%
group_by(x) %>%
summarize(y = mean(y)) %>%
filter(x < median(x)) %>%
ggplot(aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(color='darkcyan', linewidth=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected / Avg # of Tweets per sec") +
ylab("Mean Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 3e: x-axis rescaled & less than median")
# Plot 4a: first difference of y within each user (y - lag(y), computed per
# group because of the preceding group_by), i.e. the marginal gain in the
# distinct-account fraction per new observation.
# The first row of every user has fd = NA; geom_point drops those rows
# (with a warning) when plotting.
df2 %>%
group_by(user_id) %>%
mutate(fd=y-lag(y)) %>%
ggplot(aes(x=n_tweets, y=fd)) +
geom_point(alpha=0.5) +
geom_smooth(color='darkcyan', linewidth=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected") +
ylab("First-Differenced Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 4a: FD")
It seems some people follow very few accounts while others follow very many. Let’s check the distribution of friends counts as well as the maximum number of distinct accounts observed in the collected tweets.
# Per-user summary table: stated friends count vs. distinct accounts
# actually seen in the collected tweets, sorted ascending by friends count.
# FIX: arrange(-desc(x)) was a double negation; plain arrange(x) is the
# same ascending sort, stated directly.
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  distinct(user_id, max_friends_count, max_accounts_seen) %>%
  arrange(max_friends_count) -> table_dta
datatable(table_dta,
  caption = "User - Max Friends Count - Max Distinct Accounts Seen in Tweets Collected",
  filter="top")
Let’s remove users whose max_accounts_seen is less than 10 or more than 1,000 - and re-draw some of the plots.
# Plot 5a: mean y per n_tweets value after dropping outlier users
# (max_accounts_seen outside [10, 1000]).
# FIX (consistency): use summarize() like the sibling aggregate plots
# (3a, 5b, 5c). The original grouped mutate(y = mean(y)) kept every row,
# so each mean point was overplotted once per underlying observation.
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
  group_by(n_tweets) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x=n_tweets, y=y)) +
  geom_point(alpha=0.5) +
  geom_smooth(color='darkcyan', linewidth=0.5) +
  theme_few() +
  theme(legend.position="none") +
  xlab("# of Tweets Collected") +
  ylab("Mean Fraction of Distinct Accounts Appearing in Tweets (%)") +
  scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
  ggtitle("Plot 5a: w/o outliers")
# Plot 5b: mean y per rescaled-x value after dropping outlier users
# (max_accounts_seen outside [10, 1000]).
df2 %>%
  group_by(user_id) %>%
  mutate(max_accounts_seen = max(numerator)) %>%
  filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
  group_by(x) %>%
  summarize(y = mean(y)) %>%
  ggplot(aes(x = x, y = y)) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = 'darkcyan', linewidth = 0.5) +
  scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
  labs(
    title = "Plot 5b: w/o outliers + x-axis rescaled",
    x = "# of Tweets Collected / Avg # of Tweets per sec",
    y = "Mean Fraction of Distinct Accounts Appearing in Tweets (%)"
  ) +
  theme_few() +
  theme(legend.position = "none")
# Plot 5c: same as 5b, zoomed into the lower half of the rescaled x-axis.
# As in plot 3e, y is averaged per x value first, then rows at or above the
# median of the *summarized* x values are dropped.
df2 %>%
group_by(user_id) %>%
mutate(max_accounts_seen = max(numerator)) %>%
filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
group_by(x) %>%
summarize(y=mean(y)) %>%
filter(x < median(x)) %>%
ggplot(aes(x=x, y=y)) +
geom_point(alpha=0.5) +
geom_smooth(color='darkcyan', linewidth=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected / Avg # of Tweets per sec") +
ylab("Mean Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 5c: w/o outliers + x-axis rescaled & less than median")
# Plot 5d: first-differenced y (as in plot 4a) after dropping outlier users.
# The data stays grouped by user_id through filter(), so lag() is computed
# within each user; the first row per user yields fd = NA and is dropped by
# geom_point with a warning.
df2 %>%
group_by(user_id) %>%
mutate(max_accounts_seen = max(numerator)) %>%
filter(max_accounts_seen >= 10 & max_accounts_seen <= 1000) %>%
mutate(fd=y-lag(y)) %>%
ggplot(aes(x=n_tweets, y=fd)) +
geom_point(alpha=0.5) +
theme_few() +
theme(legend.position="none") +
xlab("# of Tweets Collected") +
ylab("First-Differenced Fraction of Distinct Accounts Appearing in Tweets (%)") +
scale_x_continuous(n.breaks = 15, label = scales::label_number(accuracy = 1)) +
ggtitle("Plot 5d: FD w/o outliers")
When the x-axis is the number of tweets, interpretation and the resulting decision are easier: e.g., 5,000 tweets would be a potential criterion for when to stop pulling per user.
However, when the x-axis is
# of tweets collected / avg # of tweets per sec, both
interpretation and deciding when to stop are a bit fuzzy.
Any revisions to make?
Which plots do you want me to replicate with the low-quality sources?